import json

# A record is kept when at least one of its tag fields is strictly longer
# than this many characters. The output file name and the summary message
# are both derived from this value so they cannot drift from the filter.
TAG_FIELD_MIN_LENGTH = 10

# Input / output file paths.
input_file = "processed_NewGZ_tag/train/cleaned_train2.json"  # change to your JSON path
output_file = f"filtered_tag_field_length_gt{TAG_FIELD_MIN_LENGTH}.json"


def filter_by_tag_field_length(records, min_length=TAG_FIELD_MIN_LENGTH):
    """Return the records having at least one long field in their ``"tag"``.

    Each record's ``"tag"`` value is split on ``","``; every piece is
    stripped of surrounding whitespace before its length is measured.

    Args:
        records: Iterable of mappings, each with a string under ``"tag"``.
        min_length: A field must be strictly longer than this to count.

    Returns:
        A new list with the matching records, in their original order.

    Raises:
        KeyError: If a record has no ``"tag"`` key.
    """
    return [
        item
        for item in records
        if any(len(field.strip()) > min_length for field in item["tag"].split(","))
    ]


def main():
    """Load ``input_file``, filter its records and write them to ``output_file``."""
    # Load the data.
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Keep records whose tag has at least one field longer than the threshold.
    filtered = filter_by_tag_field_length(data)

    # Save the result; ensure_ascii=False keeps non-ASCII text readable.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(filtered, f, ensure_ascii=False, indent=2)

    print(
        f"共提取 {len(filtered)} 条记录（含字段长度大于 {TAG_FIELD_MIN_LENGTH} 的 tag），"
        f"已保存至 {output_file}"
    )


if __name__ == "__main__":
    main()
